# -*- encoding:utf8 -*-
# python2.7
import urllib2
from bs4 import BeautifulSoup


class DouBanSpider(object):
    """Fetch one page of the Douban movie Top 250 list and print parts of it.

    The class downloads a list page with ``urllib2`` and then uses
    BeautifulSoup to print a handful of elements of the parsed document
    (title, first paragraph, first link, all links).
    """

    def __init__(self):
        """Set the starting page number and the list URL template."""
        # 1-based page number used by start_job().
        self.page = 1
        # "{page}" is filled with the item offset of the requested page.
        self.cur_url = "http://movie.douban.com/top250?start={page}&filter="
        print("豆瓣电影爬虫准备就绪, 准备爬取数据...")

    def get_content(self, movie_num):
        """Download a list page and return its HTML as unicode text.

        :param movie_num: 1-based page number; it is converted to the
            ``start`` offset as ``(movie_num - 1) * 25``.
        :returns: the response body decoded as UTF-8, or ``None`` when the
            request fails with ``urllib2.URLError`` (the reason is printed).
        """
        url = self.cur_url.format(page=(movie_num - 1) * 25)
        try:
            response = urllib2.urlopen(url)
            try:
                raw = response.read()
            finally:
                # Always release the underlying connection.
                response.close()
        except urllib2.URLError as e:
            # Report the failure and signal it with None instead of falling
            # through to an unbound local variable.
            print(getattr(e, "reason", e))
            return None
        return raw.decode("utf-8")

    def get_title(self, content):
        """Parse ``content`` as HTML and print several elements of it.

        Prints, in order: the ``<title>`` tag, its tag name, its text, the
        name of its parent tag, the first ``<p>`` tag, that tag's ``class``
        attribute, the first ``<a>`` tag and the list of all ``<a>`` tags.
        Lookups that depend on a missing tag are skipped.

        :param content: HTML text of the page.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and to avoid bs4's
        # "no parser was explicitly specified" warning).
        soup = BeautifulSoup(content, "html.parser")

        title = soup.title
        # The whole <title> element.
        print(title)
        if title is not None:
            # Tag name, i.e. "title".
            print(title.name)
            # Text inside the <title> element.
            print(title.string)
            # Name of the enclosing tag (normally "head").
            print(title.parent.name)

        first_p = soup.p
        # First <p> element of the document.
        print(first_p)
        if first_p is not None:
            # Its class attribute as a list, e.g. ['appintro-title'];
            # None when the tag has no class attribute.
            print(first_p.get('class'))

        # First <a> element of the document.
        print(soup.a)

        # Every <a> element of the document.
        print(soup.find_all('a'))

    def start_job(self):
        """Download the current page and print its parsed elements."""
        content = self.get_content(self.page)
        if content is None:
            # The download failed and the reason was already printed.
            return
        self.get_title(content)


if "__main__" == __name__:
    # Script entry point: build the spider and run a single crawl.
    spider = DouBanSpider()
    spider.start_job()
